library(ggplot2)
library(plyr)
library(dplyr) 
library(tm)
library("reshape2")
#install.packages("vars")
library(vars)
#install.packages("tseries")
library(tseries)
# NOTE(review): setwd() ties the script to one machine; every relative path
# below ("business.csv", "DictionaryofAffect.txt", ...) is resolved against it.
setwd("/Users/zhangweicen/Desktop/3rd semester/thesis")

# Load the news snippets and group them by day.
business <- read.csv("business.csv", header = TRUE)
# Keep only the first 10 characters of the timestamp
# (presumably "YYYY-MM-DD" -- confirm against business.csv).
business$date <- substr(business$date, 1, 10)
# One data frame per distinct date value.
ss <- split(business, business$date)

## Dictionary of Affect
# The file is read as a flat character vector and folded into 4 columns.
dictionary <- scan("DictionaryofAffect.txt", what = "character")
dictionary <- matrix(dictionary, ncol = 4, byrow = TRUE)
## w = word, ee = pleasantness, aa = activation, ii = imagery
colnames(dictionary) <- c("w", "ee", "aa", "ii")

## Select the 20% least pleasant and the 20% most pleasant words.
# The matrix holds character data, so the ratings are converted to numbers
# before ranking; ordering the raw strings would compare them
# lexicographically rather than by value.
ee.rating <- as.numeric(dictionary[, "ee"])
aa.rating <- as.numeric(dictionary[, "aa"])
n.selected <- floor(nrow(dictionary) * 0.2)
unpleasant <- dictionary[order(ee.rating, aa.rating), , drop = FALSE]
unpleasant <- unpleasant[seq_len(n.selected), , drop = FALSE]
pleasant <- dictionary[order(ee.rating, aa.rating, decreasing = TRUE), ,
                       drop = FALSE]
pleasant <- pleasant[seq_len(n.selected), , drop = FALSE]
pleasant <- data.frame(pleasant)
unpleasant <- data.frame(unpleasant)

### Example words (first 10 of each list in alphabetical order).
# View() needs an interactive session, so it is skipped in batch runs.
if (interactive()) {
  View(cbind(head(pleasant[order(pleasant$w), ], 10),
             head(unpleasant[order(unpleasant$w), ], 10)))
}
######score function
# Daily sentiment scores based on the Dictionary of Affect word lists.
#
# Args:
#   data: data frame with one day's observations; the text is taken from its
#         `snippet` column.
#
# Returns:
#   A length-2 vector c(pleasant share, unpleasant share): the fraction of the
#   day's word tokens that appear in `pleasant$w` and in `unpleasant$w`
#   (both data frames are free variables resolved outside this function).
#   c(NA, NA) is returned when there are no observations or no words.
score <- function(data){
  ## Record the sentiment score as NA if there is no observation in a day.
  ## length() of a data frame is its number of columns, so rows are checked
  ## explicitly as well.
  if (length(data) == 0 || NROW(data) == 0) {
    return(c(NA, NA))
  }

  ## Transform the snippets into a cleaned corpus.
  make.corpus <- function(data){
    corpus <- VCorpus(VectorSource(data$snippet))
    toSpace <- content_transformer(function(x, pattern) gsub(pattern, " ", x))
    # Replace "/", "@" and "|" with spaces before punctuation is removed.
    corpus <- tm_map(corpus, toSpace, "/|@|\\|")
    # Base functions are wrapped in content_transformer() so that the
    # documents keep their text-document class.
    corpus <- tm_map(corpus, content_transformer(tolower))
    corpus <- tm_map(corpus, removePunctuation)
    corpus <- tm_map(corpus, stripWhitespace)
    corpus <- tm_map(corpus, removeNumbers)
    corpus
  }
  corpus <- make.corpus(data)

  ## Split every document into words (leading whitespace dropped first) and
  ## pool the words of all observations of the day.
  words <- lapply(seq_len(length(corpus)), function(k) {
    doc.text <- as.character(corpus[[k]])
    unlist(strsplit(sub("^\\s+", "", doc.text), " +"))
  })
  words_byday <- unlist(words)
  n.words <- length(words_byday)

  ## No words left after cleaning: report NA instead of 0/0.
  if (n.words == 0) {
    return(c(NA, NA))
  }

  ## Share of pleasant / unpleasant words among all words of the day.
  ## Every token is counted once, even if a word list repeats a word.
  pleasant.score <- sum(words_byday %in% pleasant$w) / n.words
  unpleasant.score <- sum(words_byday %in% unpleasant$w) / n.words

  ## Output the sentiment score.
  c(pleasant.score, unpleasant.score)
}

options(digits=5)

## Score every day once and keep both components.
daily.scores <- lapply(ss, score)
day.dates <- vapply(ss, function(day) as.character(day$date[1]), character(1))
pos.values <- vapply(daily.scores, function(s) as.numeric(s[1]), numeric(1))
neg.values <- vapply(daily.scores, function(s) as.numeric(s[2]), numeric(1))
# Two-column character matrices: column 1 = date, column 2 = score.
positive <- unname(cbind(day.dates, pos.values))
negative <- unname(cbind(day.dates, neg.values))

## Join the daily scores with the market index data on the date.
index <- read.csv("index.csv", header = TRUE)
index$Date <- as.Date(index$Date, "%m/%d/%Y")
pos <- data.frame(Date = as.Date(positive[, 1]),
                  pos.score = as.numeric(positive[, 2]))
neg <- data.frame(Date = as.Date(negative[, 1]),
                  neg.score = as.numeric(negative[, 2]))
# Inner joins: only dates present in both inputs are kept.
data.positive <- merge(x = pos, y = index, by = "Date")
data <- merge(x = neg, y = data.positive, by = "Date")
#write.table(data,"data.csv",sep=",")
#data=read.csv("data.csv",head=T)

## Endogenous variables of the VAR; Volume is the mean-centred log volume.
var <- data.frame(negative = data$neg.score,
                  positive = data$pos.score,
                  Return = data$Return,
                  Volume = scale(log(as.numeric(data$Volume)),
                                 center = TRUE, scale = FALSE))
library(psych)
describe1 <- describe(var)
######################
library(stringr)
## Year dummies (2008-2012) used as exogenous regressors.
# format() applies the format string directly; "%y" is the two-digit year.
yr.code <- format(data$Date, "%y")
yr <- data.frame(y2008 = ifelse(yr.code == "08", 1, 0),
                 y2009 = ifelse(yr.code == "09", 1, 0),
                 y2010 = ifelse(yr.code == "10", 1, 0),
                 y2011 = ifelse(yr.code == "11", 1, 0),
                 y2012 = ifelse(yr.code == "12", 1, 0))
# View() needs an interactive session, so it is skipped in batch runs.
if (interactive()) {
  View(yr)
}
####lag selection
VARselect(var, lag.max = 10)
###VAR
# NOTE(review): if every observation falls in 2008-2012, the five dummies sum
# to the intercept -- confirm the sample also covers other years.
estimate <- VAR(var, p = 6, exogen = yr)
serial.test(estimate, lags.pt = 8, type = "PT.asymptotic")
serial.test(estimate, lags.pt = 20, type = "PT.asymptotic")
## Residual autocorrelation, one panel per equation.
par(mfrow = c(2,2))
doa.residuals <- residuals(estimate)
doa.residual.titles <- c("DOA negative residual", "DOA positive residual",
                         "Return residual", "Volume residual")
for (k in seq_along(doa.residual.titles)) {
  acf(doa.residuals[, k], xlim = c(1,20), type = "correlation", col = "red",
      lwd = 2, main = doa.residual.titles[k])
}

####unit root test
#null hypothesis is the presence of unit root
adf.test(var$negative)
adf.test(var$positive)
adf.test(var$Return)
adf.test(var$Volume)
####autocorrelation test
par(mfrow = c(3,2))
acf(var$negative, xlim = c(1,8), type ="correlation", col = "red", lwd = 2,main="DOA negative")
acf(var$positive, xlim = c(1,8), type ="correlation", col = "red", lwd = 2,main="DOA positive")
# var.new is only created further down (SLD section). On a top-to-bottom run
# it does not exist yet, so these two panels are drawn only when it is there.
if (exists("var.new")) {
  acf(var.new$negative, xlim = c(1,8), type ="correlation", col = "red", lwd = 2,main="SLD negative")
  acf(var.new$positive, xlim = c(1,8), type ="correlation", col = "red", lwd = 2,main="SLD positive")
} else {
  message("var.new not available yet; SLD autocorrelation panels skipped")
}
acf(var$Return, xlim = c(1,8), type ="correlation", col = "red", lwd = 2,main="return")
acf(var$Volume, xlim = c(1,8), type ="correlation", col = "red", lwd = 2,main="Log(volume)")
#########################################################
######################new dictionary SLD #####################
# Both lexicon files are read as plain character vectors of words.
unpleasant.new <- as.vector(scan('lexicon negative.txt', what = 'character'))
pleasant.new <- as.vector(scan('lexicon positive.txt', what = 'character'))
######score function
# Daily sentiment scores based on the SLD lexicon word lists.
#
# Args:
#   data: data frame with one day's observations; the text is taken from its
#         `snippet` column.
#
# Returns:
#   A length-2 vector c(pleasant share, unpleasant share): the fraction of the
#   day's word tokens that appear in `pleasant.new` and in `unpleasant.new`
#   (both character vectors are free variables resolved outside this
#   function). c(NA, NA) is returned when there are no observations or no
#   words.
score.new <- function(data){
  ## Record the sentiment score as NA if there is no observation in a day.
  ## length() of a data frame is its number of columns, so rows are checked
  ## explicitly as well.
  if (length(data) == 0 || NROW(data) == 0) {
    return(c(NA, NA))
  }

  ## Transform the snippets into a cleaned corpus.
  make.corpus <- function(data){
    corpus <- VCorpus(VectorSource(data$snippet))
    toSpace <- content_transformer(function(x, pattern) gsub(pattern, " ", x))
    # Replace "/", "@" and "|" with spaces before punctuation is removed.
    corpus <- tm_map(corpus, toSpace, "/|@|\\|")
    # Base functions are wrapped in content_transformer() so that the
    # documents keep their text-document class.
    corpus <- tm_map(corpus, content_transformer(tolower))
    corpus <- tm_map(corpus, removePunctuation)
    corpus <- tm_map(corpus, stripWhitespace)
    corpus <- tm_map(corpus, removeNumbers)
    corpus
  }
  corpus <- make.corpus(data)

  ## Split every document into words (leading whitespace dropped first) and
  ## pool the words of all observations of the day.
  words <- lapply(seq_len(length(corpus)), function(k) {
    doc.text <- as.character(corpus[[k]])
    unlist(strsplit(sub("^\\s+", "", doc.text), " +"))
  })
  words_byday <- unlist(words)
  n.words <- length(words_byday)

  ## No words left after cleaning: report NA instead of 0/0.
  if (n.words == 0) {
    return(c(NA, NA))
  }

  ## Share of pleasant / unpleasant words among all words of the day.
  ## Matching against the vectors directly avoids depending on the column
  ## name merge() assigns to a bare vector, and counts every token once even
  ## if a lexicon repeats a word.
  pleasant.score <- sum(words_byday %in% pleasant.new) / n.words
  unpleasant.score <- sum(words_byday %in% unpleasant.new) / n.words

  ## Output the sentiment score.
  c(pleasant.score, unpleasant.score)
}

options(digits=5)

## Score every day once with the SLD lexicon and keep both components.
daily.scores.new <- lapply(ss, score.new)
day.dates.new <- vapply(ss, function(day) as.character(day$date[1]),
                        character(1))
pos.values.new <- vapply(daily.scores.new, function(s) as.numeric(s[1]),
                         numeric(1))
neg.values.new <- vapply(daily.scores.new, function(s) as.numeric(s[2]),
                         numeric(1))
# Two-column character matrices: column 1 = date, column 2 = score.
positive.new <- unname(cbind(day.dates.new, pos.values.new))
negative.new <- unname(cbind(day.dates.new, neg.values.new))

## Market index data: chosen by hand in an interactive session; a batch run
## falls back to the "index.csv" file the DOA section reads.
index.path <- if (interactive()) file.choose() else "index.csv"
index <- read.table(index.path, sep = ",", header = TRUE)##
index$Date <- as.Date(index$Date, "%m/%d/%Y")
pos.new <- data.frame(Date = as.Date(positive.new[, 1]),
                      pos.score = as.numeric(positive.new[, 2]))
neg.new <- data.frame(Date = as.Date(negative.new[, 1]),
                      neg.score = as.numeric(negative.new[, 2]))
# Inner joins: only dates present in both inputs are kept.
data.positive.new <- merge(x = pos.new, y = index, by = "Date")
#data.negative.new=merge(x=neg.new,y=index,by.x="Date",by.y="Date",all=F,all.x=F,all.y=F)
data.new <- merge(x = neg.new, y = data.positive.new, by = "Date")
write.table(data.new, "data.new.csv", sep = ",")

## Endogenous variables of the VAR; Volume is the mean-centred log volume.
var.new <- data.frame(negative = data.new$neg.score,
                      positive = data.new$pos.score,
                      Return = data.new$Return,
                      Volume = scale(log(data.new$Volume),
                                     center = TRUE, scale = FALSE))
# psych provides describe(); load it before the first use.
library(psych)
describe2 <- describe(var.new)
VARselect(var.new, lag.max = 10)
# NOTE(review): `yr` was built from the DOA data set; it must have one row per
# row of var.new -- confirm both merges used the same index file.
estimate.new <- VAR(var.new, p = 6, exogen = yr)
summary(estimate.new)
# Print the descriptive statistics computed above.
describe2
serial.test(estimate.new, lags.pt = 8)
serial.test(estimate.new, lags.pt = 20)
#residual graph
par(mfrow = c(2,2))
sld.residuals <- residuals(estimate.new)
sld.residual.titles <- c("SLD negative residual", "SLD positive residual",
                         "Return residual", "Volume residual")
for (k in seq_along(sld.residual.titles)) {
  acf(sld.residuals[, k], xlim = c(1,20), type = "correlation", col = "red",
      lwd = 2, main = sld.residual.titles[k])
}

####unit root test
#null hypothesis is the presence of unit root
adf.test(var.new$negative)
adf.test(var.new$positive)

####autocorrelation test
acf(var.new$negative, xlim = c(1,10), type ="correlation", col = "red", lwd = 2,main="SLD negative")
acf(var.new$positive, xlim = c(1,10), type ="correlation", col = "red", lwd = 2,main="SLD positive")


